package com.wisnews.util.spider;

import com.wisnews.service.sec.INewsService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} for the qjnu.edu.cn site.
 *
 * <p>For every page it processes, it extracts the links that match the article
 * detail URL pattern, queues them as further crawl targets, and publishes the
 * list of links as a result field on the page.
 *
 * @Author: yuyd
 * @Email: yyd8358@foxmail.com
 * @Date: 2020/12/28 11:05
 * @Version 1.0
 */
@Component
@Slf4j
public class QjnuWebMagic implements PageProcessor {

    /**
     * Regex for article detail URLs. The dots of the host name and of the
     * {@code .html} suffix are escaped so that they only match a literal dot.
     */
    private static final String ARTICLE_LINK_REGEX =
            "(https://www\\.qjnu\\.edu\\.cn/contents/\\w+/\\w+\\.html)";

    /** Crawl settings: retry count, sleep time between requests, request headers and charset. */
    private final Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(100)
            .addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36")
            .addHeader("Accept-Language","en-US,en;q=0.5")
            .setCharset("utf-8");

    /** Injected news service; not used by the code in this class as shown. */
    @Autowired
    private INewsService newsService;

    /**
     * Collects the article detail links found on the page, schedules them for
     * crawling and stores them in the page's result fields.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // Queue article detail pages for crawling.
        List<String> articleLinks = page.getHtml().links().regex(ARTICLE_LINK_REGEX).all();
        page.addTargetRequests(articleLinks);

        // Parameterized logging; the rendered message is the same as before.
        log.info("爬虫开始，获取链接总数{}", articleLinks.size());
        // Replaces a System.out.println(page) debugging leftover: log only the URL, at debug level.
        log.debug("page url: {}", page.getUrl());

        // NOTE(review): the key "likes" may be a typo for "links"; it is kept unchanged
        // because consumers outside this file may read the field under this name.
        page.putField("likes", articleLinks);
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the configured {@link Site}
     */
    @Override
    public Site getSite() {
        return site;
    }

}
